Description of features:

Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
In [60]:
# custom plot settings

size=20
params = {'legend.fontsize': 'large',
          'legend.title_fontsize': size*0.75,
          'legend.frameon': False,
          'figure.figsize': (8,4),
          'axes.labelsize': size,
          'axes.titlesize': size,
          'xtick.labelsize': size*0.75,
          'ytick.labelsize': size*0.75,
          'axes.titlepad': 10,
          'figure.subplot.hspace': 0.9,
          'font.sans-serif': 'Arial'}
# matplotlib.rcParams.keys() # to see editable options
plt.rcParams.update(params)

Import and explore data

In [86]:
# import data
df = pd.read_excel('../data/gen/collapsed_data.xlsx')

# create amplitude ratio feature
df['amp_ratio'] = df.amplitude_1 / df.amplitude_3

# create 'area under curve' feature (not really area, but correlated)
df['auc_1'] = df.amplitude_1 * df.duration_2
df['auc_2'] = df.amplitude_3 * (df.duration_3 - df.duration_2)

# add gender to df
male_list = [True, True, True, False, True, True, False, True, True, False]
df['male'] = df.person_id.apply(lambda x: male_list[x-1])
pd.crosstab(df.person_id, df.male)

# check df
print(df.isnull().sum())
display(df.describe())
df.head()
observation         0
person_id           0
swallow_id          0
swallow_volume      0
duration_1          0
duration_2          0
duration_3          0
amplitude_1         0
amplitude_2         0
amplitude_3         0
slope_1             0
slope_2             0
slope_3             0
curvature           0
kernel_1            0
kernel_2            0
emg               420
amp_ratio           0
auc_1               0
auc_2               0
male                0
dtype: int64
observation person_id swallow_id swallow_volume duration_1 duration_2 duration_3 amplitude_1 amplitude_2 amplitude_3 slope_1 slope_2 slope_3 curvature kernel_1 kernel_2 emg amp_ratio auc_1 auc_2
count 600.000000 600.000000 600.000000 600.00000 600.000000 600.000000 600.000000 600.000000 600.000000 600.000000 600.000000 600.000000 600.000000 600.000000 600.000000 600.000000 180.000000 600.000000 600.000000 600.000000
mean 300.500000 5.500000 30.500000 17.50000 0.370100 0.743950 0.890000 0.018075 0.013978 0.016094 -3.007653 -0.898699 -0.104684 3.579869 -1.383693 -1.716930 25.020863 1.465841 0.011640 0.002690
std 173.349358 2.874678 17.332552 8.54625 0.116865 0.363889 0.345599 0.037822 0.032150 0.034921 9.444107 5.486877 3.857271 10.960481 4.163572 8.164709 8.237387 13.409624 0.027156 0.007761
min 1.000000 1.000000 1.000000 5.00000 0.140000 0.180000 0.200000 0.000078 -0.004357 -0.002564 -87.060601 -57.026369 -47.126710 -0.809382 -37.508637 -114.251113 11.835165 -148.756914 0.000016 -0.001004
25% 150.750000 3.000000 15.750000 10.00000 0.300000 0.460000 0.640000 0.000960 0.000224 0.000274 -0.640922 -0.105458 0.007777 0.090265 -0.395089 -0.314073 19.104038 0.853511 0.000597 0.000006
50% 300.500000 5.500000 30.500000 17.50000 0.340000 0.590000 0.820000 0.002286 0.000863 0.000840 -0.165830 -0.004063 0.040500 0.219980 -0.086981 -0.025378 22.334834 1.114600 0.001806 0.000103
75% 450.250000 8.000000 45.250000 25.00000 0.390000 1.032500 1.210000 0.021695 0.013856 0.014584 -0.071081 0.050745 0.096178 0.899968 -0.036277 0.000034 30.283716 2.415636 0.010717 0.000921
max 600.000000 10.000000 60.000000 30.00000 0.820000 1.710000 1.740000 0.261093 0.243388 0.244349 0.790131 11.661284 10.626430 99.706008 0.286400 0.744584 51.222534 163.853877 0.276759 0.063078
Out[86]:
observation person_id swallow_id swallow_volume duration_1 duration_2 duration_3 amplitude_1 amplitude_2 amplitude_3 slope_1 slope_2 slope_3 curvature kernel_1 kernel_2 emg amp_ratio auc_1 auc_2 male
0 1 1 1 5 0.54 0.70 0.73 0.015030 0.012916 0.012958 -0.419831 -0.211804 -0.155339 0.652619 -0.302997 -0.924259 NaN 1.159940 0.010521 0.000389 True
1 2 1 2 5 0.56 0.91 1.12 0.022678 0.015516 0.017619 -0.293227 -0.035574 0.192503 0.473737 -0.213068 -0.415240 NaN 1.287175 0.020637 0.003700 True
2 3 1 3 5 0.47 0.70 0.95 0.017288 0.012654 0.016443 -0.471205 -0.060896 0.362103 0.730759 -0.336938 -0.384147 NaN 1.051385 0.012102 0.004111 True
3 4 1 4 5 0.28 0.83 0.88 0.021712 0.010454 0.010512 -4.107379 1.208510 1.123713 5.149526 -2.087322 -1.744316 NaN 2.065506 0.018021 0.000526 True
4 5 1 5 5 0.23 0.47 0.51 0.021861 0.011751 0.011933 -9.487379 1.636070 2.322143 10.839715 -4.064915 -6.749040 NaN 1.832013 0.010275 0.000477 True
In [87]:
# Mean of each feature by swallow volume

dfmean = df.groupby('swallow_volume').agg('mean').reset_index()

for x in [v for v in dfmean.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    plt.scatter(dfmean.swallow_volume, dfmean[x])
    plt.title(x)
    plt.show()
In [88]:
for x in [v for v in df.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    display(df.boxplot(column=[x], by=['swallow_volume']))
<AxesSubplot:title={'center':'duration_1'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'duration_2'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'duration_3'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'amplitude_1'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'amplitude_2'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'amplitude_3'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'slope_1'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'slope_2'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'slope_3'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'curvature'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'kernel_1'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'kernel_2'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'emg'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'amp_ratio'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'auc_1'}, xlabel='[swallow_volume]'>
<AxesSubplot:title={'center':'auc_2'}, xlabel='[swallow_volume]'>

Compare features by gender

In [89]:
for x in [v for v in df.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    display(df.boxplot(column=[x], by=['male']))
    plt.title('')
    plt.show()
<AxesSubplot:title={'center':'duration_1'}, xlabel='[male]'>
<AxesSubplot:title={'center':'duration_2'}, xlabel='[male]'>
<AxesSubplot:title={'center':'duration_3'}, xlabel='[male]'>
<AxesSubplot:title={'center':'amplitude_1'}, xlabel='[male]'>
<AxesSubplot:title={'center':'amplitude_2'}, xlabel='[male]'>
<AxesSubplot:title={'center':'amplitude_3'}, xlabel='[male]'>
<AxesSubplot:title={'center':'slope_1'}, xlabel='[male]'>
<AxesSubplot:title={'center':'slope_2'}, xlabel='[male]'>
<AxesSubplot:title={'center':'slope_3'}, xlabel='[male]'>
<AxesSubplot:title={'center':'curvature'}, xlabel='[male]'>
<AxesSubplot:title={'center':'kernel_1'}, xlabel='[male]'>
<AxesSubplot:title={'center':'kernel_2'}, xlabel='[male]'>
<AxesSubplot:title={'center':'emg'}, xlabel='[male]'>
<AxesSubplot:title={'center':'amp_ratio'}, xlabel='[male]'>
<AxesSubplot:title={'center':'auc_1'}, xlabel='[male]'>
<AxesSubplot:title={'center':'auc_2'}, xlabel='[male]'>
In [ ]:
 
In [ ]:
 

Demeaned data

In [90]:
# demean function 

def demean(df, var = ['person_id']):
    """Demean df grouped by var, and divides by std dev. Returns a df.
    """
    # get means
    dfmeans = df.groupby(var).agg('mean').reset_index()
    
    # get std devs
    dfstd = df.groupby(var).agg('std').reset_index()
    
    # loop through cols and demean
    for x in [v for v in df.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
        dftmp = dfmeans[var + [x]]
        dftmp = pd.concat([dftmp, dfstd[x]], 1)
        dftmp.columns = var + ['xmean', 'xstd']
        df = df.merge(dftmp, on=var, how='inner')
        df[x] = (df[x] - df.xmean) / df.xstd
        df.drop(['xmean', 'xstd'], 1, inplace=True)
    
    return df

dfdemean = demean(df, ['person_id'])
dfdemean.head()
Out[90]:
observation person_id swallow_id swallow_volume duration_1 duration_2 duration_3 amplitude_1 amplitude_2 amplitude_3 slope_1 slope_2 slope_3 curvature kernel_1 kernel_2 emg amp_ratio auc_1 auc_2 male
0 1 1 1 5 2.630209 0.382084 0.021820 -1.282420 -1.093047 -1.244876 1.147441 0.390064 0.121480 -1.164842 1.174876 0.363168 NaN -0.221308 -0.861366 -0.749815 True
1 2 1 2 5 2.848786 1.107123 1.237530 -1.167598 -1.050642 -1.166034 1.153719 0.400971 0.150521 -1.172576 1.185194 0.387549 NaN 0.201259 -0.691698 -0.538310 True
2 3 1 3 5 1.865189 0.382084 0.707605 -1.248523 -1.097313 -1.185918 1.144894 0.399404 0.164681 -1.161464 1.170982 0.389039 NaN -0.581839 -0.834858 -0.512064 True
3 4 1 4 5 -0.211291 0.830918 0.489401 -1.182096 -1.133196 -1.286250 0.964579 0.477963 0.228267 -0.970425 0.970165 0.323889 NaN 2.786213 -0.735570 -0.741072 True
4 5 1 5 5 -0.757733 -0.412006 -0.663964 -1.179861 -1.112046 -1.262212 0.697789 0.504424 0.328322 -0.724417 0.743281 0.084172 NaN 2.010749 -0.865498 -0.744156 True
In [91]:
# plot demeaned boxplots

for x in [v for v in dfdemean.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    dfdemean.boxplot(column=[x], by=['swallow_volume'])
In [92]:
import seaborn as sns

# get subset
dfsub = dfdemean.loc[(dfdemean.person_id <= 5) | (dfdemean.person_id == 7), :]

for y in [v for v in dfsub.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    print(y)
    
    # Plot
    sns.set(font_scale=1.5)
    sns.set_style("ticks")
    fig = plt.figure(figsize=(8,5))
    ax1 = fig.add_subplot(111) # row, col, num
    sns.stripplot(x='swallow_volume', y=y, hue='person_id', 
                  data=dfsub, dodge=True, jitter=True, alpha=.40, 
                  zorder=1, size=8)
    sns.pointplot(x='swallow_volume', y=y, ci='sd', 
                  data=dfsub, join=False, scale=1.5, 
                  zorder=100, color='black')
    ax1.set_xlabel('volume (mL)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5), loc='center left', fontsize='xx-small')
duration_1
duration_2
duration_3
amplitude_1
amplitude_2
amplitude_3
slope_1
slope_2
slope_3
curvature
kernel_1
kernel_2
emg
amp_ratio
auc_1
auc_2
In [124]:
# subplot with amp1, amp2, ampr, auc1

# get data subset
dfsub = dfdemean.loc[(dfdemean.person_id == 4) & (dfdemean.swallow_volume <= 15)]

# init plot
sns.set_style("ticks")
fig, ax = plt.subplots(2, 2, figsize=(12,8))

# amp1
y = 'amplitude_1'
sns.stripplot(ax=ax[0,0], x='swallow_volume', y=y, hue='person_id', 
      data=dfsub, dodge=True, jitter=True, alpha=.40, 
      zorder=1)
sns.pointplot(ax=ax[0,0], x='swallow_volume', y=y, ci='sd', 
      data=dfsub, join=False, 
      zorder=100, color='black')

# amp2
y = 'amplitude_2'
sns.stripplot(ax=ax[0,1], x='swallow_volume', y=y, hue='person_id', 
      data=dfsub, dodge=True, jitter=True, alpha=.40, 
      zorder=1)
sns.pointplot(ax=ax[0,1], x='swallow_volume', y=y, ci='sd', 
      data=dfsub, join=False, 
      zorder=100, color='black')

# ampr
y = 'amp_ratio'
sns.stripplot(ax=ax[1,0], x='swallow_volume', y=y, hue='person_id', 
      data=dfsub, dodge=True, jitter=True, alpha=.40, 
      zorder=1)
sns.pointplot(ax=ax[1,0], x='swallow_volume', y=y, ci='sd', 
      data=dfsub, join=False, 
      zorder=100, color='black')

# auc1
y = 'auc_1'
sns.stripplot(ax=ax[1,1], x='swallow_volume', y=y, hue='person_id', 
      data=dfsub, dodge=True, jitter=True, alpha=.40, 
      zorder=1)
sns.pointplot(ax=ax[1,1], x='swallow_volume', y=y, ci='sd', 
      data=dfsub, join=False, 
      zorder=100, color='black')

ax[0,0].title.set_text('Amplitude of First Peak')
ax[0,1].title.set_text('Amplitude of Second Peak')
ax[1,0].title.set_text('Ratio of Peak Amplitudes')
ax[1,1].title.set_text('Area Under Curve, First Peak')

# all units are 'normalized' to be mean zero, stddev = 1
ax[0,0].set_ylabel('Normalized Units')
ax[0,1].set_ylabel('')
ax[1,0].set_ylabel('Normalized Units')
ax[1,1].set_ylabel('')
# ax[0,0].set_ylabel('Normalized Amplitude')
# ax[0,1].set_ylabel('Normalized Amplitude')
# ax[1,0].set_ylabel('% Size')
# ax[1,1].set_ylabel('Normalized Area')

for i in range(2):
    for j in range(2):
        if i == 0:
            ax[i, j].set_xlabel('')
        else:
            ax[i, j].set_xlabel('Swallow Volume (ml)')
        ax[i,j].get_legend().remove()

fig.tight_layout()
In [123]:
# subplot with amp1, amp2, ampr, auc1

# get data subset
dfsub = dfdemean.loc[(dfdemean.person_id == 4) & (dfdemean.swallow_volume > 0)]

# init plot
sns.set_style("ticks")
fig, ax = plt.subplots(2, 2, figsize=(12,8))

# amp1
y = 'amplitude_1'
sns.stripplot(ax=ax[0,0], x='swallow_volume', y=y, hue='person_id', 
      data=dfsub, dodge=True, jitter=True, alpha=.40, 
      zorder=1)
sns.pointplot(ax=ax[0,0], x='swallow_volume', y=y, ci='sd', 
      data=dfsub, join=False, 
      zorder=100, color='black')

# amp2
y = 'amplitude_2'
sns.stripplot(ax=ax[0,1], x='swallow_volume', y=y, hue='person_id', 
      data=dfsub, dodge=True, jitter=True, alpha=.40, 
      zorder=1)
sns.pointplot(ax=ax[0,1], x='swallow_volume', y=y, ci='sd', 
      data=dfsub, join=False, 
      zorder=100, color='black')

# ampr
y = 'amp_ratio'
sns.stripplot(ax=ax[1,0], x='swallow_volume', y=y, hue='person_id', 
      data=dfsub, dodge=True, jitter=True, alpha=.40, 
      zorder=1)
sns.pointplot(ax=ax[1,0], x='swallow_volume', y=y, ci='sd', 
      data=dfsub, join=False, 
      zorder=100, color='black')

# auc1
y = 'auc_1'
sns.stripplot(ax=ax[1,1], x='swallow_volume', y=y, hue='person_id', 
      data=dfsub, dodge=True, jitter=True, alpha=.40, 
      zorder=1)
sns.pointplot(ax=ax[1,1], x='swallow_volume', y=y, ci='sd', 
      data=dfsub, join=False, 
      zorder=100, color='black')

ax[0,0].title.set_text('Amplitude of First Peak')
ax[0,1].title.set_text('Amplitude of Second Peak')
ax[1,0].title.set_text('Ratio of Peak Amplitudes')
ax[1,1].title.set_text('Area Under Curve, First Peak')

# all units are 'normalized' to be mean zero, stddev = 1
ax[0,0].set_ylabel('Normalized Units')
ax[0,1].set_ylabel('')
ax[1,0].set_ylabel('Normalized Units')
ax[1,1].set_ylabel('')
# ax[0,0].set_ylabel('Normalized Amplitude')
# ax[0,1].set_ylabel('Normalized Amplitude')
# ax[1,0].set_ylabel('% Size')
# ax[1,1].set_ylabel('Normalized Area')

for i in range(2):
    for j in range(2):
        if i == 0:
            ax[i, j].set_xlabel('')
        else:
            ax[i, j].set_xlabel('Swallow Volume (ml)')
        ax[i,j].get_legend().remove()

fig.tight_layout()
In [129]:
# get subset
dfsub = dfdemean.loc[(dfdemean.person_id <= 7) & (dfdemean.person_id != 6), :]

for y in [v for v in dfsub.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    print(y)
    
    # Plot
    sns.set(font_scale=1.5)
    sns.set_style("ticks")
    fig = plt.figure(figsize=(8,5))
    ax1 = fig.add_subplot(111) # row, col, num
    sns.stripplot(x='swallow_volume', y=y, hue='person_id', 
                  data=dfsub, dodge=True, jitter=True, alpha=.40, 
                  zorder=1, size=8)
    sns.pointplot(x='swallow_volume', y=y, ci='sd', 
                  data=dfsub, join=False, scale=1.5, 
                  zorder=100, color='black')
    ax1.set_xlabel('volume (mL)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5), loc='center left', fontsize='xx-small')
duration_1
duration_2
duration_3
amplitude_1
amplitude_2
amplitude_3
slope_1
slope_2
slope_3
curvature
kernel_1
kernel_2
emg
amp_ratio
auc_1
auc_2
In [125]:
# get subset
dfsub = dfdemean.loc[dfdemean.person_id >= 8, :]

for y in [v for v in dfsub.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    print(y)
    
    # Plot
    sns.set(font_scale=1.5)
    sns.set_style("ticks")
    fig = plt.figure(figsize=(8,5))
    ax1 = fig.add_subplot(111) # row, col, num
    sns.stripplot(x='swallow_volume', y=y, hue='person_id', 
                  data=dfsub, dodge=True, jitter=True, alpha=.40, 
                  zorder=1, size=8)
    sns.pointplot(x='swallow_volume', y=y, ci='sd', 
                  data=dfsub, join=False, scale=1.5, 
                  zorder=100, color='black')
    ax1.set_xlabel('volume (mL)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5), loc='center left', fontsize='xx-small')
duration_1
duration_2
duration_3
amplitude_1
amplitude_2
amplitude_3
slope_1
slope_2
slope_3
curvature
kernel_1
kernel_2
emg
amp_ratio
auc_1
auc_2

Gender differences?

In [130]:
# get subset
dfsub = df.loc[df.person_id >= 0, :]

for y in [v for v in dfsub.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    print(y)
    
    # Plot
    sns.set(font_scale=1.5)
    sns.set_style("ticks")
    fig = plt.figure(figsize=(8,5))
    ax1 = fig.add_subplot(111) # row, col, num
    sns.stripplot(x='male', y=y, hue='person_id', 
                  data=dfsub, dodge=True, jitter=True, alpha=.40, 
                  zorder=1, size=8)
    sns.pointplot(x='male', y=y, ci='sd', 
                  data=dfsub, join=False, scale=1.5, 
                  zorder=100, color='black')
    ax1.set_xlabel('Male (Boolean)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5), loc='center left', fontsize='xx-small')
duration_1
duration_2
duration_3
amplitude_1
amplitude_2
amplitude_3
slope_1
slope_2
slope_3
curvature
kernel_1
kernel_2
emg
amp_ratio
auc_1
auc_2
In [ ]:
 

T-SNE

In [131]:
# fit T-SNE and plot

from sklearn.manifold import TSNE

# get subset of data
dfsub = df.loc[df.person_id >= 0, 
                 [c for c in df.columns if
                  c not in ['observation', 'swallow_id', 'emg']]]

# rescale data
def rescale(vec):
    return (vec - vec.mean()) / vec.std()
for v in [c for c in dfsub.columns if c not in ['person_id', 'swallow_volume']]:
    dfsub[v] = dfsub.groupby(['person_id'])[v].apply(lambda x: rescale(x))

# define X and Y vecs
X = dfsub.iloc[:, 2:-1]
Y = dfsub.loc[:, 'swallow_volume']

# fit model
X_embedded = TSNE(n_components = 2, 
                  perplexity = 50,
                  learning_rate = 100,
                  n_iter = 5000).fit_transform(X, Y)
print(X_embedded.shape)

# merge results
dfm = pd.concat([dfsub.reset_index(drop=True), 
                 pd.DataFrame(X_embedded, columns=['tsne0', 'tsne1'])], 1)
dfm.head()



# plot t-sne, color by person

# color map
cmap = plt.cm.jet

fig, ax = plt.subplots()

scatter = ax.scatter(dfm.tsne0, dfm.tsne1, c=dfm.person_id, cmap=cmap)
ax.legend(*scatter.legend_elements(num=10), ncol=2,
            loc="center left", bbox_to_anchor=(1, 0.5), title="person id")
plt.title('Colored by person id')
plt.show()


# plot t-sne, color by swallow volume

cmap = matplotlib.cm.Blues(np.linspace(0,1,10))
cmap = matplotlib.colors.ListedColormap(cmap[2:,:-1])

fig, ax = plt.subplots()

scatter = ax.scatter(dfm.tsne0, dfm.tsne1, c=dfm.swallow_volume, cmap=cmap)
ax.legend(*scatter.legend_elements(num=6),
            loc="center left", bbox_to_anchor=(1, 0.5), title="swallow volume")
plt.title('Colored by swallow volume')
plt.show()


# plot t-sne, color by gender

fig, ax = plt.subplots()

scatter = ax.scatter(dfm.tsne0, dfm.tsne1, c=df.male, cmap=cmap)
ax.legend(*scatter.legend_elements(num=2),
            loc="center left", bbox_to_anchor=(1, 0.5), title="Male (Boolean)")
plt.title('Colored by gender')
plt.show()
(600, 2)